home *** CD-ROM | disk | FTP | other *** search
- package horst.parser;
-
- import java.io.BufferedOutputStream;
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.io.Reader;
- import java.net.URL;
- import java.net.URLConnection;
-
- public class HTMLParser implements ParserCallback {
- private static final int TEXT_STATE = 1;
- private static final int TAG_STATE = 2;
- private static final int COMMENT_STATE = 3;
- private static final int TAGQUOTE_STATE = 5;
- private static final int ENTITYREF_STATE = 6;
- private static final int SCRIPT_TAG_STATE = 7;
- private static final int CARRIAGE_RETURN = 13;
- private static final int NEWLINE = 10;
- private static final int TAB = 9;
- private static final char SINGLEQUOTE = '\'';
- private static final char DOUBLEQUOTE = '"';
- private static final int TYPE_LEN = 256;
- private static final byte T_WHITESPACE = 1;
- private static char[] m_scriptTag = new char[9];
- private static byte[] m_ctype = new byte[256];
- private static Escapes m_escapes = new Escapes();
- ParserCallback m_callback = this;
- Reader m_in;
- int m_lineNum = 0;
- int m_tagquote;
- int m_comment;
- int m_script;
- StringBuffer m_buf = new StringBuffer();
- StringBuffer m_entityBuffer = new StringBuffer();
- StringBuffer m_scriptBuffer = new StringBuffer();
- int m_parserState = 1;
- int m_ch;
- boolean m_bSpace;
- boolean m_bPreformat;
- boolean m_bNoBreakSpace;
- BufferedOutputStream m_out;
- TagGenerator m_generator = new HTMLTags();
- boolean m_bPlainText;
- Tag m_lastTag;
-
- static {
- m_scriptTag[0] = '<';
- m_scriptTag[1] = '/';
- m_scriptTag[2] = 's';
- m_scriptTag[3] = 'c';
- m_scriptTag[4] = 'r';
- m_scriptTag[5] = 'i';
- m_scriptTag[6] = 'p';
- m_scriptTag[7] = 't';
- m_scriptTag[8] = '>';
- int len = m_ctype.length;
-
- for(int i = 0; i < len; ++i) {
- m_ctype[i] = 0;
- }
-
- m_ctype[32] = 1;
- m_ctype[13] = 1;
- m_ctype[10] = 1;
- m_ctype[9] = 1;
-
- for(int i = 14; i <= 31; ++i) {
- m_ctype[i] = 1;
- }
-
- }
-
- private void beginParsing(URLConnection con) throws IOException {
- String contentType = con.getContentType();
- if (contentType != null) {
- this.m_bPlainText = contentType.equals("text/plain");
- }
-
- InputStream istream = con.getInputStream();
- this.m_in = new BufferedReader(new InputStreamReader(istream), 16384);
- this.m_buf = new StringBuffer();
- this.m_entityBuffer = new StringBuffer();
- this.m_scriptBuffer = new StringBuffer();
- System.gc();
- this.m_callback.startingParsing(con.getURL());
- if (this.m_bPlainText) {
- this.m_callback.handleTag(this.m_generator.getTag("html"), true);
- this.m_callback.handleTag(this.m_generator.getTag("pre"), true);
- }
-
- this.parseContent();
- if (this.m_bPlainText) {
- this.m_callback.handleTag(this.m_generator.getTag("html"), false);
- this.m_callback.handleTag(this.m_generator.getTag("pre"), false);
- }
-
- this.m_callback.finishedParsing();
- this.m_callback = null;
- }
-
- private final void callbackContent() {
- this.m_lastTag = null;
- if (this.m_bNoBreakSpace) {
- this.m_bNoBreakSpace = false;
- int i = 0;
- int buflen = this.m_buf.length();
-
- int buflen_minus1;
- for(buflen_minus1 = buflen - 1; i < buflen && !Character.isWhitespace(this.m_buf.charAt(i)); ++i) {
- }
-
- if (i == buflen_minus1 && buflen > 1) {
- StringBuffer temp = new StringBuffer();
- temp.append(this.m_buf.charAt(buflen_minus1));
- this.m_buf.setLength(buflen_minus1);
- if (this.m_buf.length() > 0) {
- this.m_callback.handleContent(this.m_buf);
- }
-
- if (temp.length() > 0) {
- this.m_callback.handleContent(temp);
- }
-
- this.m_buf.setLength(0);
- return;
- }
-
- if (i < buflen - 1 && buflen > 1) {
- StringBuffer temp = new StringBuffer();
-
- for(int j = i + 1; j < buflen; ++j) {
- temp.append(this.m_buf.charAt(j));
- }
-
- this.m_buf.setLength(i + 1);
- this.m_callback.handleContent(this.m_buf);
- this.m_buf.setLength(0);
- if (temp.length() > 0) {
- this.m_callback.handleContent(temp);
- }
-
- return;
- }
-
- this.m_callback.handleContent(this.m_buf);
- this.m_buf.setLength(0);
- } else {
- int buflen = this.m_buf.length();
-
- int i;
- for(i = buflen - 1; i >= 0 && !Character.isWhitespace(this.m_buf.charAt(i)); --i) {
- }
-
- if (i < 0) {
- this.m_callback.handleContent(this.m_buf);
- } else {
- StringBuffer temp = new StringBuffer();
- if (i == buflen - 1) {
- temp.append(this.m_buf.charAt(i));
- if (buflen > 1) {
- this.m_buf.setLength(buflen - 1);
- this.m_callback.handleContent(this.m_buf);
- }
-
- this.m_callback.handleContent(temp);
- } else {
- for(int j = i + 1; j <= buflen - 1; ++j) {
- temp.append(this.m_buf.charAt(j));
- }
-
- this.m_buf.setLength(i + 1);
- this.m_callback.handleContent(this.m_buf);
- if (temp.length() > 0) {
- this.m_callback.handleContent(temp);
- }
- }
- }
-
- this.m_buf.setLength(0);
- }
-
- }
-
- private int checkRenderable(int c) {
- switch (c) {
- case 146:
- case 8217:
- return 39;
- case 149:
- return 42;
- case 8212:
- return 45;
- case 8226:
- return 183;
- default:
- return c;
- }
- }
-
- public void finishedParsing() {
- }
-
- public File getSourceFile(URL u, String path) {
- File f = new File(path);
-
- try {
- FileOutputStream fs = new FileOutputStream(f);
- this.m_out = new BufferedOutputStream(fs);
- this.parse(u.openConnection());
- this.m_out.close();
- } catch (IOException e) {
- System.out.println("IOException getSourceFile: " + e);
- f = null;
- } finally {
- this.m_out = null;
- }
-
- return f;
- }
-
- public void handleContent(StringBuffer txt) {
- }
-
- public void handleNoBreakSpace() {
- }
-
- public void handleTag(Tag t, boolean bStartTag) {
- }
-
- private void insertLineBreak() {
- this.m_callback.handleTag(this.m_generator.getTag(HTMLDefs.getName(7)), true);
- }
-
- private boolean isNewlineCharacter(int c) {
- switch (c) {
- case 10:
- case 13:
- return true;
- default:
- return false;
- }
- }
-
- private boolean isSpace(int c) {
- return c >= 0 && c < 256 ? (m_ctype[c] & 1) != 0 : false;
- }
-
- private boolean notifyTagEncountered() {
- Tag tag = null;
- String buf = this.m_buf.toString();
- int len = buf.length();
- int idx = 0;
-
- for(int begin = 0; idx < len && this.isSpace(buf.charAt(idx)); ++idx) {
- }
-
- if (idx == len) {
- return false;
- } else {
- int var9;
- for(var9 = idx; idx < len && !this.isSpace(buf.charAt(idx)); ++idx) {
- }
-
- String token = buf.substring(var9, idx);
- boolean bStartTag = token.charAt(0) != '/';
- if (!bStartTag) {
- tag = this.m_generator.getTag(token.substring(1));
- } else {
- tag = this.m_generator.getTag(token);
- }
-
- if (tag == null) {
- return false;
- } else {
- while(idx < len && this.isSpace(buf.charAt(idx))) {
- ++idx;
- }
-
- if (idx != len) {
- this.parseAttributes(tag, buf, idx);
- }
-
- if (tag.getID() == 28) {
- this.insertLineBreak();
- }
-
- if (bStartTag) {
- this.m_callback.handleTag(tag, true);
- } else {
- this.m_callback.handleTag(tag, false);
- }
-
- if (this.m_bSpace && this.m_lastTag != null && this.m_lastTag.breaksFlow()) {
- this.m_bSpace = false;
- }
-
- this.m_lastTag = tag;
- if (this.m_bPreformat && !bStartTag && tag.getID() == 28) {
- this.m_bPreformat = false;
- } else if (bStartTag && tag.getID() == 28) {
- this.m_bPreformat = true;
- }
-
- if (this.m_lastTag.getID() == 49 && bStartTag) {
- this.m_parserState = 7;
- } else {
- this.m_parserState = 1;
- }
-
- return true;
- }
- }
- }
-
- public void parse(Reader r, URL baseURL) throws IOException {
- this.m_in = r;
- this.m_buf.setLength(0);
- this.m_callback.startingParsing((URL)null);
- this.parseContent();
- this.m_callback.finishedParsing();
- }
-
- public void parse(URL u) throws IOException {
- this.beginParsing(u.openConnection());
- }
-
- public void parse(URLConnection con) throws IOException {
- this.beginParsing(con);
- }
-
- private void parseAttributes(Tag tag, String buf, int idx) {
- int len = buf.length();
- int begin = 0;
-
- while(true) {
- String name;
- int end;
- while(true) {
- while(true) {
- label114:
- while(true) {
- if (idx >= len) {
- return;
- }
-
- while(idx < len && this.isSpace(buf.charAt(idx))) {
- ++idx;
- }
-
- if (idx != len) {
- begin = idx;
- if (buf.charAt(idx) == '"') {
- ++idx;
-
- while(idx < len && buf.charAt(idx) != '"') {
- ++idx;
- }
-
- if (idx != len) {
- ++idx;
- break;
- }
- } else {
- if (buf.charAt(idx) != '\'') {
- while(true) {
- if (idx >= len || this.isSpace(buf.charAt(idx)) || buf.charAt(idx) == '=') {
- break label114;
- }
-
- ++idx;
- }
- }
-
- ++idx;
-
- while(idx < len && buf.charAt(idx) != '\'') {
- ++idx;
- }
-
- if (idx != len) {
- ++idx;
- break;
- }
- }
- }
- }
-
- name = buf.substring(begin, idx).toLowerCase();
- if (idx < len && this.isSpace(buf.charAt(idx))) {
- while(idx < len && this.isSpace(buf.charAt(idx))) {
- ++idx;
- }
- }
-
- if (idx != len && buf.charAt(idx) == '=') {
- ++idx;
- if (idx != len) {
- if (buf.charAt(idx) != ' ') {
- break;
- }
-
- while(idx < len && this.isSpace(buf.charAt(idx))) {
- ++idx;
- }
-
- if (idx != len) {
- break;
- }
- }
- } else if (tag != null) {
- tag.setAttribute(name, "");
- }
- }
-
- begin = idx;
- if (buf.charAt(idx) == '"') {
- ++idx;
-
- for(begin = idx; idx < len && buf.charAt(idx) != '"'; ++idx) {
- }
-
- if (idx != len) {
- end = idx++;
- break;
- }
- } else {
- if (buf.charAt(idx) == '\'') {
- ++idx;
-
- for(begin = idx; idx < len && buf.charAt(idx) != '\''; ++idx) {
- }
-
- if (idx == len) {
- continue;
- }
-
- end = idx++;
- break;
- }
-
- while(idx < len && !this.isSpace(buf.charAt(idx))) {
- ++idx;
- }
-
- end = idx;
- break;
- }
- }
-
- String value = buf.substring(begin, end);
- if (tag != null) {
- tag.setAttribute(name, value);
- }
- }
- }
-
- private void parseComment() {
- if (this.m_ch == 62 && this.m_comment >= 2) {
- this.m_buf.setLength(0);
- this.m_comment = 0;
- this.m_parserState = 1;
- } else {
- if (this.m_ch == 45) {
- ++this.m_comment;
- } else {
- this.m_comment = 0;
- }
-
- }
- }
-
- private void parseContent() throws IOException {
- boolean bPreformated = false;
- this.m_bSpace = false;
-
- while(true) {
- this.readChar();
- if (this.m_ch < 0) {
- int state = this.m_parserState;
- if (this.m_buf.length() > 0 && state == 1) {
- this.m_callback.handleContent(this.m_buf);
- }
-
- return;
- }
-
- this.m_ch = this.checkRenderable(this.m_ch);
- switch (this.m_parserState) {
- case 1:
- this.parseText();
- continue;
- case 2:
- this.parseTag();
- continue;
- case 3:
- this.parseComment();
- case 4:
- default:
- continue;
- case 5:
- if (this.m_ch == 62) {
- this.m_parserState = 2;
- } else {
- this.m_buf.append((char)this.m_ch);
- if (this.m_ch == this.m_tagquote) {
- this.m_parserState = 2;
- }
- }
- continue;
- case 6:
- this.parseEntityReference();
- continue;
- case 7:
- }
-
- int i;
- switch (this.m_ch) {
- case 60:
- if (this.m_script == 0) {
- ++this.m_script;
- this.m_scriptBuffer.append((char)this.m_ch);
- continue;
- }
-
- this.m_script = 0;
- if (this.m_scriptBuffer.length() > 0) {
- for(int i = 0; i < this.m_scriptBuffer.length(); ++i) {
- this.m_buf.append(this.m_scriptBuffer.charAt(i));
- }
-
- this.m_scriptBuffer.setLength(0);
- this.m_buf.append((char)this.m_ch);
- }
- continue;
- default:
- if (this.m_script <= 0) {
- this.m_buf.append((char)this.m_ch);
- continue;
- }
-
- if (m_scriptTag[this.m_script] == Character.toLowerCase((char)this.m_ch)) {
- if (this.m_script == m_scriptTag.length - 1) {
- this.m_script = 0;
- this.m_scriptBuffer.setLength(0);
- if (this.m_buf.length() > 0) {
- this.callbackContent();
- }
-
- this.m_callback.handleTag(this.m_generator.getTag(HTMLDefs.getName(49)), false);
- this.m_parserState = 1;
- } else {
- this.m_scriptBuffer.append((char)this.m_ch);
- ++this.m_script;
- }
- continue;
- }
-
- this.m_script = 0;
- i = 0;
- }
-
- while(i < this.m_scriptBuffer.length()) {
- this.m_buf.append(this.m_scriptBuffer.charAt(i));
- ++i;
- }
-
- this.m_scriptBuffer.setLength(0);
- this.m_buf.append((char)this.m_ch);
- }
- }
-
- private void parseEntityReference() throws IOException {
- if (this.m_ch == 35) {
- int n = 0;
- this.readChar();
- if (this.m_ch >= 48 && this.m_ch <= 57) {
- while(this.m_ch >= 48 && this.m_ch <= 57) {
- n = n * 10 + this.m_ch - 48;
- this.readChar();
- }
-
- if (n == 9) {
- for(int i = 0; i < 3; ++i) {
- this.m_buf.append(' ');
- }
- } else {
- n = this.checkRenderable(n);
- this.m_buf.append((char)n);
- }
-
- if (this.m_ch == 59) {
- this.m_parserState = 1;
- } else {
- this.resynchronize();
- }
- } else {
- if (this.m_buf.length() == 0 && this.m_lastTag != null && this.m_lastTag.breaksFlow()) {
- this.m_bSpace = false;
- }
-
- if (this.m_bSpace) {
- this.m_buf.append(' ');
- this.m_bSpace = false;
- }
-
- this.m_buf.append('&');
- this.m_buf.append('#');
- this.m_parserState = 1;
- }
- } else {
- if (this.m_buf.length() == 0 && this.m_lastTag != null && this.m_lastTag.breaksFlow()) {
- this.m_bSpace = false;
- }
-
- if (this.m_bSpace) {
- this.m_buf.append(' ');
- this.m_bSpace = false;
- }
-
- if (this.parseIdentifier(false)) {
- Character c = (Character)m_escapes.get(this.m_entityBuffer.toString());
- if (c != null) {
- if (c == ' ') {
- int buflen = this.m_buf.length();
- if (buflen > 0) {
- int pos;
- for(pos = buflen - 1; pos >= 0 && !Character.isWhitespace(this.m_buf.charAt(pos)); --pos) {
- }
-
- if (pos < 0) {
- this.m_callback.handleContent(this.m_buf);
- } else {
- StringBuffer temp = new StringBuffer();
- if (pos == buflen - 1) {
- --pos;
- pos = Math.max(0, pos);
- }
-
- for(int j = pos + 1; j < buflen; ++j) {
- temp.append(this.m_buf.charAt(j));
- }
-
- this.m_buf.setLength(pos + 1);
- this.m_callback.handleContent(this.m_buf);
- if (temp.length() > 0) {
- this.m_callback.handleContent(temp);
- }
- }
- }
-
- this.m_buf.setLength(0);
- this.m_callback.handleNoBreakSpace();
- this.m_bNoBreakSpace = true;
- } else {
- this.m_buf.append(c);
- }
-
- if (this.m_ch == 59) {
- this.m_parserState = 1;
- } else {
- this.resynchronize();
- }
-
- return;
- }
- }
-
- this.m_buf.append('&');
- if (this.m_entityBuffer.length() > 0) {
- char[] dst = new char[this.m_entityBuffer.length()];
- this.m_entityBuffer.getChars(0, this.m_entityBuffer.length(), dst, 0);
- this.m_buf.append(dst);
- }
-
- this.resynchronize();
- }
-
- }
-
- private boolean parseIdentifier(boolean lower) throws IOException {
- this.m_entityBuffer.setLength(0);
- switch (this.m_ch) {
- case 65:
- case 66:
- case 67:
- case 68:
- case 69:
- case 70:
- case 71:
- case 72:
- case 73:
- case 74:
- case 75:
- case 76:
- case 77:
- case 78:
- case 79:
- case 80:
- case 81:
- case 82:
- case 83:
- case 84:
- case 85:
- case 86:
- case 87:
- case 88:
- case 89:
- case 90:
- if (lower) {
- this.m_ch = 97 + (this.m_ch - 65);
- }
- break;
- case 91:
- case 92:
- case 93:
- case 94:
- case 95:
- case 96:
- default:
- return false;
- case 97:
- case 98:
- case 99:
- case 100:
- case 101:
- case 102:
- case 103:
- case 104:
- case 105:
- case 106:
- case 107:
- case 108:
- case 109:
- case 110:
- case 111:
- case 112:
- case 113:
- case 114:
- case 115:
- case 116:
- case 117:
- case 118:
- case 119:
- case 120:
- case 121:
- case 122:
- }
-
- while(true) {
- this.m_entityBuffer.append((char)this.m_ch);
- this.readChar();
- switch (this.m_ch) {
- case 45:
- case 46:
- case 48:
- case 49:
- case 50:
- case 51:
- case 52:
- case 53:
- case 54:
- case 55:
- case 56:
- case 57:
- case 95:
- case 97:
- case 98:
- case 99:
- case 100:
- case 101:
- case 102:
- case 103:
- case 104:
- case 105:
- case 106:
- case 107:
- case 108:
- case 109:
- case 110:
- case 111:
- case 112:
- case 113:
- case 114:
- case 115:
- case 116:
- case 117:
- case 118:
- case 119:
- case 120:
- case 121:
- case 122:
- break;
- case 47:
- case 58:
- case 59:
- case 60:
- case 61:
- case 62:
- case 63:
- case 64:
- case 91:
- case 92:
- case 93:
- case 94:
- case 96:
- default:
- return true;
- case 65:
- case 66:
- case 67:
- case 68:
- case 69:
- case 70:
- case 71:
- case 72:
- case 73:
- case 74:
- case 75:
- case 76:
- case 77:
- case 78:
- case 79:
- case 80:
- case 81:
- case 82:
- case 83:
- case 84:
- case 85:
- case 86:
- case 87:
- case 88:
- case 89:
- case 90:
- if (lower) {
- this.m_ch = 97 + (this.m_ch - 65);
- }
- }
- }
- }
-
- private void parseTag() {
- switch (this.m_ch) {
- case 34:
- case 39:
- this.m_tagquote = this.m_ch;
- this.m_buf.append((char)this.m_ch);
- this.m_parserState = 5;
- break;
- case 60:
- if (this.m_buf.length() > 0) {
- if (this.notifyTagEncountered()) {
- this.m_parserState = 2;
- this.m_buf.setLength(0);
- } else {
- this.m_buf.setLength(0);
- }
- }
- break;
- case 62:
- if (this.m_buf.length() > 0) {
- if (this.notifyTagEncountered()) {
- this.m_parserState = 1;
- } else {
- this.m_parserState = 1;
- }
-
- this.m_buf.setLength(0);
- }
- break;
- default:
- if (this.m_ch == 45 && this.m_buf.length() == 2 && this.m_buf.charAt(1) == '-' && this.m_buf.charAt(0) == '!') {
- this.m_buf.setLength(0);
- this.m_parserState = 3;
- } else {
- this.m_buf.append((char)this.m_ch);
- }
- }
-
- }
-
- private void parseText() {
- if (!this.m_bPlainText && !this.m_bPreformat) {
- switch (this.m_ch) {
- case 38:
- this.m_parserState = 6;
- break;
- case 60:
- this.m_parserState = 2;
- if (this.m_buf.length() > 0) {
- this.callbackContent();
- }
- break;
- default:
- if (this.isSpace(this.m_ch)) {
- this.m_bSpace = true;
- } else {
- if (this.m_buf.length() == 0 && this.m_lastTag != null && this.m_lastTag.breaksFlow()) {
- this.m_bSpace = false;
- }
-
- if (this.m_bSpace) {
- this.m_buf.append(' ');
- this.m_bSpace = false;
- }
-
- this.m_buf.append((char)this.m_ch);
- }
- }
-
- } else {
- switch (this.m_ch) {
- case 9:
- for(int i = 0; i < 3; ++i) {
- this.m_buf.append(' ');
- }
- break;
- case 10:
- if (this.m_buf.length() > 0) {
- this.callbackContent();
- }
-
- this.insertLineBreak();
- case 13:
- break;
- case 38:
- this.m_parserState = 6;
- break;
- case 60:
- if (!this.m_bPlainText) {
- this.m_parserState = 2;
- if (this.m_buf.length() > 0) {
- this.callbackContent();
- }
- }
- break;
- default:
- this.m_buf.append((char)this.m_ch);
- }
-
- }
- }
-
- private void readChar() throws IOException {
- this.m_ch = this.m_in.read();
- if (this.m_out != null && this.m_ch >= 0) {
- this.m_out.write(this.m_ch);
- }
-
- }
-
- private void resynchronize() {
- switch (this.m_ch) {
- case 38:
- this.m_parserState = 6;
- break;
- case 60:
- this.m_parserState = 2;
- if (this.m_buf.length() > 0) {
- if (this.m_bSpace) {
- this.m_buf.append(' ');
- }
-
- this.callbackContent();
- }
- break;
- default:
- if (this.m_bSpace) {
- this.m_buf.append(' ');
- }
-
- this.m_buf.append((char)this.m_ch);
- this.m_parserState = 1;
- }
-
- }
-
- public void setCallback(ParserCallback cb) {
- this.m_bPlainText = false;
- this.m_callback = cb;
- this.m_lineNum = 0;
- this.m_buf.setLength(0);
- this.m_entityBuffer.setLength(0);
- this.m_parserState = 1;
- this.m_bPreformat = false;
- this.m_lastTag = null;
- this.m_bSpace = false;
- }
-
- public void setTagGenerator(TagGenerator generator) {
- if (generator != null) {
- this.m_generator = generator;
- }
-
- }
-
- public void startingParsing(URL u) {
- }
- }
-